In [8]:
# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
import os
import sys
# add the project's src/data directory to the path so local modules can be imported
src_dir = os.path.join(os.getcwd(), os.pardir, 'src', 'data')
sys.path.append(src_dir)
In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import metrics
from tqdm import tqdm
from scipy.ndimage.filters import gaussian_filter1d
from numpy.random import poisson
In [3]:
interim_dir = os.path.join(os.getcwd(), os.pardir, 'data', 'interim')
figures_dir = os.path.join(os.getcwd(), os.pardir, 'reports', 'figures')
In [4]:
%aimport scrape_buda
In [75]:
from scrape_buda import BudaRating
In [76]:
ratings = scrape_buda.BudaRating()
prefix = os.path.join(interim_dir, 'data20160521')
ratings.load_buda(prefix)
In [77]:
ratings.predicted_rating()
In [70]:
ratings.allteams[ratings.allteams['teamid'] == 39878]
Out[70]:
In [59]:
ratings.check_league_type(39878)
In [78]:
ratings.allteams = pd.read_csv(os.path.join(interim_dir, 'withselfcaptainensemble_ratings_numbers.csv'))
In [37]:
ratings.allteams.to_csv(os.path.join(interim_dir, 'withselfcaptainensemble_ratings_numbers.csv'))
In [87]:
year_index = (ratings.allteams['year'] == 2015) & (ratings.allteams['type'] == 'Hat')
this_year = ratings.allteams[year_index]
In [89]:
len(this_year)
Out[89]:
In [91]:
96 * 10
Out[91]:
In [32]:
sph_index = (ratings.allteams['season'] == 'Spring') & \
(ratings.allteams['type'] == 'Hat') & \
(ratings.allteams['divname'] == 'JP Mixed (4/3)') & \
(ratings.allteams['year'] >= 2010)
sph = ratings.allteams[sph_index]
In [58]:
sns.set_context('poster')
sns.set_style('white')
fig, ax = plt.subplots(figsize=(6, 5))
sns.distplot(sph['plusminus'], kde=False, bins=range(-12,12))
plt.ylabel('Number of Teams')
plt.xlabel('Average Plus/Minus per Game')
plt.tight_layout()
plt.savefig(os.path.join(figures_dir, 'PlusMinusDistribution'))
In [94]:
whoa = (sph['plusminus']) >= 5
len(sph[whoa])
Out[94]:
In [93]:
whoa = (sph['plusminus']) <= -5
len(sph[whoa])
Out[93]:
In [29]:
len(sph)
Out[29]:
In [30]:
171 / 7.
Out[30]:
In [31]:
sph.head()
Out[31]:
In [32]:
30/171.
Out[32]:
In [42]:
years = range(2010, 2017)
for year in years:
    sph_index1 = (ratings.allteams['season'] == 'Spring') & \
                 (ratings.allteams['type'] == 'Hat') & \
                 (ratings.allteams['divname'] == 'JP Mixed (4/3)') & \
                 (ratings.allteams['year'] == year)
    sph1 = ratings.allteams[sph_index1]
    print(year, len(sph1))
In [43]:
year = 2014
sph_index1 = (ratings.allteams['season'] == 'Spring') & \
(ratings.allteams['type'] == 'Hat') & \
(ratings.allteams['divname'] == 'JP Mixed (4/3)') & \
(ratings.allteams['year'] == year)
sph1 = ratings.allteams[sph_index1]
In [44]:
sph1
Out[44]:
In [45]:
11/7.
Out[45]:
There is one missing team in 2014: Team 17, Flesh Eating Virus. They went 5-2 with a total plus/minus of +11, so a per-game plus/minus of about +1.57. No need to worry about this missing data point.
If I know the average number of goals scored in a game and the length of each game, I can estimate the average scoring rate.
Suppose the average number of goals scored per game is 18 and the game length is 70 minutes.
In [17]:
goalpermin = 18/70.
goalpermin
Out[17]:
The average scoring rate is then about 0.257 goals per minute.
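A quick analytic check (an added aside, not one of the original cells): under this model the per-game total is the sum of 70 independent Poisson(goalpermin) minutes, so it is itself Poisson with mean 18 and standard deviation sqrt(18) ≈ 4.2 goals. That sets the spread to expect in the simulated histogram a couple of cells below.
In [ ]:
# Mean and standard deviation of the per-game goal total under the Poisson model.
mean_total = goalpermin * 70
std_total = np.sqrt(goalpermin * 70)
mean_total, std_total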
In [13]:
ok = [poisson(goalpermin, 70).sum() for i in range(171)]
In [14]:
sns.distplot(ok, kde=False)
Out[14]:
That looks like a pretty reasonable distribution of point totals per game.
Next, the question we are really interested in: suppose you are on a team with an average plus/minus of -5 per game. If the halves are split evenly in time, then at halftime you are typically behind by about 2.5 points, with only 9 or so more points left to be played.
I am missing something here. Need some measure of the variance of plus/minus values. Time to sleep on it.
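A rough sketch of the halftime question in the meantime (an added cell, assuming for simplicity that both teams score at the league-average rate in the second half): how often does the trailing team win the remaining 35 minutes by 3 or more goals, enough to overturn an average 2.5-goal deficit?
In [ ]:
# Hypothetical sketch: simulate 35 remaining minutes with each team scoring at
# half the league-average rate, and count how often the trailing team outscores
# the leader by 3 or more goals.
half_rate = goalpermin / 2
n_sim = 10000
comebacks = 0
for _ in range(n_sim):
    us = poisson(half_rate, 35).sum()
    them = poisson(half_rate, 35).sum()
    if us - them >= 3:
        comebacks += 1
comebacks / float(n_sim)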
In [15]:
18 / 70.
Out[15]:
In [16]:
0.257 / 2
Out[16]:
An average team playing against an average team has an expected goal-scoring rate of 0.257 / 2 ≈ 0.13 goals/minute. A team that averages -5 plus/minus per game gives up 2.5 more goals per 70 minutes and scores 2.5 fewer goals per 70 minutes when playing an average team. (The cells below actually use an asymmetric split of that deficit, 3.5 fewer goals scored and 1.5 more allowed per game, which still works out to -5.)
In [95]:
gpm1 = 18 / 2 - 3.5
In [96]:
gpm2 = 18/2 + 1.5
In [98]:
gpm1 / 70.
Out[98]:
In [99]:
gpm2/70.
Out[99]:
In [19]:
ok1 = [poisson(gpm1) for i in range(171)]
In [20]:
ok2 = [poisson(gpm2) for i in range(171)]
In [21]:
sns.distplot(ok1, kde=False, bins=range(25))
sns.distplot(ok2, kde=False, bins=range(25))
Out[21]:
In [22]:
okdiff = np.array(ok1) - np.array(ok2)
oksum = np.array(ok1) + np.array(ok2)
In [23]:
sns.distplot(okdiff, kde=False)
Out[23]:
In [24]:
sns.distplot(oksum)
Out[24]:
In [25]:
np.mean(okdiff)
Out[25]:
In [26]:
ok = [poisson(gpm1, 70).sum() + poisson(gpm2, 70).sum() for i in range(171)]
In [27]:
np.mean(ok)
Out[27]:
In [11]:
def underdogwin(gpm1, gpm2, remaining_time, thresh):
    # Simulate 100 games of (70 - remaining_time) minutes at per-minute scoring
    # rates gpm1 and gpm2, and count how many times team 1 finishes more than
    # `thresh` goals ahead of team 2 (a count out of 100, i.e. roughly a percent).
    # (The plots below flip the x-axis so the index maps back to time remaining.)
    ok1 = [poisson(gpm1, 70 - remaining_time).sum() for i in range(100)]
    ok2 = [poisson(gpm2, 70 - remaining_time).sum() for i in range(100)]
    okdiff = np.array(ok1) - np.array(ok2)
    over_thresh = okdiff > thresh
    return len(okdiff[over_thresh])
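A quick spot check of the helper (an added cell, not part of the original run): with equal scoring rates and the full 70 minutes simulated (remaining_time=0), how many of the 100 simulated games does team 1 win by more than 2 goals?
In [ ]:
# Equal per-minute scoring rates, full game simulated, margin threshold of 2.
underdogwin(18 / 2 / 70., 18 / 2 / 70., 0, 2)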
In [29]:
nsim = 100
sim_come0 = []
sim_come1 = []
sim_come2 = []
sim_come3 = []
for isim in tqdm(range(nsim)):
    come_from_behind0 = []
    come_from_behind1 = []
    come_from_behind2 = []
    come_from_behind3 = []
    remaining_times = range(70)
    for remaining_time in remaining_times:
        gpm1 = 18 / 2 / 70.
        gpm2 = 18 / 2 / 70.
        thresh = 0
        wins = underdogwin(gpm1, gpm2, remaining_time, thresh)
        come_from_behind0.append(wins)
        thresh = 1
        wins = underdogwin(gpm1, gpm2, remaining_time, thresh)
        come_from_behind1.append(wins)
        thresh = 2
        wins = underdogwin(gpm1, gpm2, remaining_time, thresh)
        come_from_behind2.append(wins)
        thresh = 3
        wins = underdogwin(gpm1, gpm2, remaining_time, thresh)
        come_from_behind3.append(wins)
    sim_come0.append(come_from_behind0)
    sim_come1.append(come_from_behind1)
    sim_come2.append(come_from_behind2)
    sim_come3.append(come_from_behind3)
come_mean0 = np.mean(sim_come0, axis=0)
come_mean1 = np.mean(sim_come1, axis=0)
come_mean2 = np.mean(sim_come2, axis=0)
come_mean3 = np.mean(sim_come3, axis=0)
come_std0 = np.std(sim_come0, axis=0)
come_std1 = np.std(sim_come1, axis=0)
come_std2 = np.std(sim_come2, axis=0)
come_std3 = np.std(sim_come3, axis=0)
In [12]:
nsim = 100
sim_comes = []
threshes = range(4)
for thresh in threshes:
    sim_come = []
    for isim in tqdm(range(nsim)):
        come_from_behind = []
        remaining_times = range(70)
        for remaining_time in remaining_times:
            gpm1 = 18 / 2 / 70.
            gpm2 = 18 / 2 / 70.
            wins = underdogwin(gpm1, gpm2, remaining_time, thresh)
            come_from_behind.append(wins)
        sim_come.append(come_from_behind)
    sim_comes.append(sim_come)
come_mean0 = np.mean(sim_comes[0], axis=0)
come_mean1 = np.mean(sim_comes[1], axis=0)
come_mean2 = np.mean(sim_comes[2], axis=0)
come_mean3 = np.mean(sim_comes[3], axis=0)
come_std0 = np.std(sim_comes[0], axis=0)
come_std1 = np.std(sim_comes[1], axis=0)
come_std2 = np.std(sim_comes[2], axis=0)
come_std3 = np.std(sim_comes[3], axis=0)
In [13]:
nsim = 100
sim_comes = []
threshes = range(4)
for thresh in threshes:
    sim_come = []
    for isim in tqdm(range(nsim)):
        come_from_behind = []
        remaining_times = range(70)
        for remaining_time in remaining_times:
            gpm1 = 18 / 2 / 70. - 3.5 / 70
            gpm2 = 18 / 2 / 70. + 1.5 / 70
            wins = underdogwin(gpm1, gpm2, remaining_time, thresh)
            come_from_behind.append(wins)
        sim_come.append(come_from_behind)
    sim_comes.append(sim_come)
come_mean0_bad = np.mean(sim_comes[0], axis=0)
come_mean1_bad = np.mean(sim_comes[1], axis=0)
come_mean2_bad = np.mean(sim_comes[2], axis=0)
come_mean3_bad = np.mean(sim_comes[3], axis=0)
come_std0_bad = np.std(sim_comes[0], axis=0)
come_std1_bad = np.std(sim_comes[1], axis=0)
come_std2_bad = np.std(sim_comes[2], axis=0)
come_std3_bad = np.std(sim_comes[3], axis=0)
In [14]:
sns.set_context('poster')
sns.set_style('white')
f, ax = plt.subplots(figsize=(6, 5))
xarr = np.arange(len(come_mean2_bad), 0, -1)
yarr = gaussian_filter1d(come_mean2, 3)
sigarr = gaussian_filter1d(come_std2, 3)
y1 = yarr - sigarr
y2 = yarr + sigarr
# plt.fill_between(xarr, y1, y2, color='gray', alpha=0.4)
plt.plot(xarr, yarr, label='Team A equal to Team B')
yarr = gaussian_filter1d(come_mean2_bad, 3)
sigarr = gaussian_filter1d(come_std2_bad, 3)
y1 = yarr - sigarr
y2 = yarr + sigarr
# plt.fill_between(xarr, y1, y2, color='gray', alpha=0.4)
plt.plot(xarr, yarr, label='Team A much worse than Team B')
plt.ylim([0, 100])
plt.xlim([70,0])
plt.xlabel('Time Remaining [minutes]')
plt.ylabel('Percent Chance Team A Wins By Three')
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(figures_dir, 'WinBy3Probability'))
# plt.plot(come_mean_bad)
In [223]:
sns.set_context('poster')
# xarr = range(len(come_mean2))
# yarr = gaussian_filter1d(come_mean2, 3)
# sigarr = gaussian_filter1d(come_std2, 3)
# y1 = yarr - sigarr
# y2 = yarr + sigarr
# plt.fill_between(xarr, y1, y2, color='gray', alpha=0.5)
# plt.plot(xarr, yarr)
xarr = range(len(come_mean0_bad))
yarr = gaussian_filter1d(come_mean0_bad, 3)
sigarr = gaussian_filter1d(come_std0_bad, 3)
y1 = yarr - sigarr
y2 = yarr + sigarr
plt.fill_between(xarr, y1, y2, color='gray', alpha=0.5)
plt.plot(xarr, yarr)
# plt.plot(come_mean_bad)
Out[223]:
In [10]:
sph_index = (ratings.allteams['season'] == 'Spring') & \
(ratings.allteams['type'] == 'Hat') & \
(ratings.allteams['divname'] == 'JP Mixed (4/3)') & \
(ratings.allteams['year'] >= 2010)
sph = ratings.allteams[sph_index]
In [84]:
sns.distplot(sph['n_exp_rating'], kde=False, bins=10)
sns.distplot(sph['n_cap_rating'], kde=False, bins=10)
sns.distplot(sph['n_capexp_rating'], kde=False, bins=10)
Out[84]:
In [80]:
sph['n_exp_rating'].mean()
Out[80]:
In [81]:
sph['n_cap_rating'].mean()
Out[81]:
In [85]:
sph['n_capexp_rating'].mean()
Out[85]:
In [82]:
sph['n_exp_rating'].median()
Out[82]:
In [83]:
sph['n_cap_rating'].median()
Out[83]:
In [86]:
sph['n_capexp_rating'].median()
Out[86]:
In [70]:
.75*16
Out[70]:
In [71]:
.69*16
Out[71]:
In [87]:
.84*16
Out[87]:
Simulation of two evenly matched teams: what is the distribution of 171 observed average point differentials, each averaged over a 7-game season?
In [88]:
goalpermin
Out[88]:
In [23]:
avgoff = []
for isim in range(171):
    ok1 = [poisson(goalpermin/2, 70).sum() for i in range(7)]
    ok2 = [poisson(goalpermin/2, 70).sum() for i in range(7)]
    off = np.array(ok1) - np.array(ok2)
    avgoff.append(off.mean())
In [109]:
sns.set_context('poster')
sns.set_style('white')
fig, ax = plt.subplots(figsize=(6, 5))
sns.distplot(avgoff, kde=False, bins=range(-10, 11), color='gray', label='Equal Skill')
sns.distplot(sph['plusminus'], kde=False, bins=range(-10,11), label='Observed')
plt.ylabel('Number of Teams')
plt.xlabel('Average Plus/Minus per Game')
plt.tight_layout()
plt.legend()
plt.savefig(os.path.join(figures_dir, 'PlusMinusDistribution'))
Given equal teams, what is the likelihood of winning fewer than 10% of your games, i.e. going winless over a 7-game season?
In [18]:
wins = []
for isim in range(171):
    ok1 = [poisson(goalpermin/2, 70).sum() for i in range(7)]
    ok2 = [poisson(goalpermin/2, 70).sum() for i in range(7)]
    off = np.array(ok1) - np.array(ok2)
    win_index = off > 0
    wins.append(len(off[win_index]))
In [19]:
sns.distplot(wins, kde=False, bins=range(8))
Out[19]:
In [120]:
np.array(wins).sum()
Out[120]:
In [121]:
7 * 171
Out[121]:
In [122]:
564 / 1197.
Out[122]:
In [123]:
1/7.
Out[123]:
In [130]:
n1 = len(np.array(wins)[np.array(wins) < 1])
In [131]:
n1
Out[131]:
In [132]:
2 / 171.
Out[132]:
In [28]:
sns.set_context('poster')
sns.set_style('white')
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax = axes[0]
sns.distplot(avgoff, kde=False, bins=range(-10, 11), color='gray', label='Equal Skill', ax=ax)
sns.distplot(sph['plusminus'], kde=False, bins=range(-10,11), label='Observed', ax=ax)
ax.set_ylabel('Number of Teams')
ax.set_xlabel('Average Plus/Minus per Game')
ax.legend()
ax = axes[1]
xarr = np.arange(len(come_mean2_bad), 0, -1)
yarr = gaussian_filter1d(come_mean2, 3)
sigarr = gaussian_filter1d(come_std2, 3)
y1 = yarr - sigarr
y2 = yarr + sigarr
# plt.fill_between(xarr, y1, y2, color='gray', alpha=0.4)
ax.plot(xarr, yarr, label='Team A equal to Team B')
yarr = gaussian_filter1d(come_mean2_bad, 3)
sigarr = gaussian_filter1d(come_std2_bad, 3)
y1 = yarr - sigarr
y2 = yarr + sigarr
# plt.fill_between(xarr, y1, y2, color='gray', alpha=0.4)
ax.plot(xarr, yarr, label='Team A much worse than Team B')
ax.set_ylim([0, 100])
ax.set_xlim([70,0])
ax.set_xlabel('Time Remaining [minutes]')
ax.set_ylabel('Percent Chance Team A Wins By Three')
ax.legend()
plt.tight_layout(w_pad=2)
plt.savefig(os.path.join(figures_dir, 'PlusMinusDistribution_WinBy3Probability'))
# plt.plot(come_mean_bad)
An alternative way of investigating the probability of winning one game out of 7 or fewer: simple counting. There are 7 equally likely win/loss sequences with exactly one win and 1 sequence with no wins, out of 2**7 = 128 possible sequences.
In [3]:
ways_to_win_one_or_less = 7 + 1
In [7]:
total_possible_outcomes = 2. ** 7
In [8]:
probability = ways_to_win_one_or_less / total_possible_outcomes
In [9]:
probability
Out[9]:
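The same number can be read directly off the binomial CDF; a minimal cross-check (an added cell), assuming scipy.stats is available:
In [ ]:
from scipy.stats import binom
# P(at most 1 win in 7 games) when each game is a 50/50 coin flip:
# (C(7,0) + C(7,1)) / 2**7 = 8 / 128 = 0.0625
binom.cdf(1, 7, 0.5)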
In [10]:
1 / 7.
Out[10]:
In [13]:
7 * 5 * 4 / 3 / 2. / 2**7
Out[13]:
In [14]:
corepart = 1.396 + 1.476 + 1.484 + 1.363 + 1.082 + 1.038
In [16]:
corepart / 6.
Out[16]:
In [17]:
lcp = 786. + 800 + 819 + 899 + 1032 + 947
lcp / 6
Out[17]:
In [18]:
rcp = 342. + 306 + 361 + 427 + 440 + 431
rcp / 6
Out[18]:
In [19]:
(13651 - 8467) / 8467.
Out[19]:
In [20]:
8467 * 1.6
Out[20]:
In [12]:
7/11.
Out[12]:
In [26]:
leaguelist = {'alabama': 3, 'arkansas': 3, 'alaska': 3, 'arizona': 3, 'california': 10, 'colorado': 7, 'connecticut': 4,
'delaware': 1, 'florida': 9, 'georgia': 5, 'hawaii': 1, 'idaho': 1, 'illinois': 4, 'indiana': 8, 'iowa': 5,
'kansas': 4, 'kentucky': 2, 'louisiana': 1, 'maine': 3, 'maryland': 5, 'massachusetts': 2,
'michigan': 6, 'minnesota': 2, 'mississippi': 2, 'missouri': 2, 'montana': 2, 'nebraska': 2, 'nevada': 2,
'new hampshire': 1, 'new jersey': 5, 'new mexico': 3, 'new york': 11, 'north carolina': 8, 'north dakota': 0,
'ohio': 4, 'oklahoma': 1, 'oregon': 6, 'pennsylvania': 6, 'rhode island': 1, 'south carolina': 5, 'south dakota': 1,
'tennessee': 4, 'texas': 6, 'utah': 4, 'vermont': 2, 'virginia': 6,
'washington': 3, 'west virginia': 0, 'wisconsin': 3, 'wyoming': 0}
In [27]:
totleagues = 0
for key in leaguelist:
    totleagues += leaguelist[key]
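Equivalently (an added aside), the dictionary values can be summed directly:
In [ ]:
sum(leaguelist.values())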
In [28]:
totleagues
Out[28]:
In [29]:
yarr
In [1]:
200/60.
Out[1]:
In [2]:
1/3. * (1800 + 1580 + 1400)
Out[2]:
In [3]:
16/6.
Out[3]:
In [4]:
.84*16
Out[4]:
In [6]:
import seaborn as sns
In [10]:
current_palette = sns.color_palette()
sns.palplot(current_palette)
In [11]:
current_palette
Out[11]:
In [ ]: